When removing the gene ENSG00000187951 there was an important change in the WGCNA modules, and when analysing what could have caused this, we found that this gene had a very big outlier value for a single sample, which we think altered the values of the other genes during the vst transformation enough to alter the modules by the WGNCA algorithm (See 20_02_28_comparison.html for more details about this).

Since the preprocessing pipeline did not catch this behaviour and it ended up causing problems in downstream analysis, we want to see how often we can find these type of behaviours in the data and if/how much they alter the results of WGCNA modules.

To do this, for each gene, we’ll calculate its variance leaving one sample out, repeating this for all samples and then calculating the variance of these estimates. A high variance would point to the existance of an outlier value.

I’ll use the filtered raw data obtained with the 20_02_06_data_preprocessing.html pipeline, where genes with low levels of expression as well are samples with weird behaviours are supposed to have been filtered out already

load('./../../AllRegions/Data/filtered_raw_data_old.RData')
datExpr_raw = datExpr
datGenes_raw = datGenes
load('./../../AllRegions/Data/preprocessed_data.RData')
DE_info = DE_info %>% data.frame %>% mutate('ID' = rownames(datExpr), 'DE' = padj<0.05) %>%
          dplyr::select(ID, log2FoldChange, padj, DE)
datExpr = datExpr_raw
datGenes = datGenes_raw

rm(datExpr_raw, dds)

SD of leave 1 out SD

sd_leave1out_sd_by_gene = data.frame('ID'=rownames(datExpr),
                                     'sd_l1o'=apply(datExpr, 1, function(x){sd(sapply(1:ncol(datExpr), function(s) sd(x[-s])))})) %>%
                          left_join(DE_info, by = 'ID')

ENSG00000187951_val = sd_leave1out_sd_by_gene[sd_leave1out_sd_by_gene$ID == 'ENSG00000187951',2]

cat(paste0('Gene ENSG00000187951 has a value of ', round(ENSG00000187951_val,2)))
## Gene ENSG00000187951 has a value of 16.62
cat(paste0('There are ', sum(sd_leave1out_sd_by_gene$sd_l1o>ENSG00000187951_val), ' genes with a value higher than this (~',
           round(100*mean(sd_leave1out_sd_by_gene$sd_l1o>ENSG00000187951_val),1),'%)'))
## There are 497 genes with a value higher than this (~3.1%)
summary(sd_leave1out_sd_by_gene$sd_l1o)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    0.008    0.398    1.075    3.758    2.802 4226.243
p = sd_leave1out_sd_by_gene %>% ggplot(aes(ID, sd_l1o+1, color = DE)) + geom_point(alpha=0.3) + 
                                       geom_hline(yintercept = ENSG00000187951_val, color='gray') + 
                                       xlab('Genes') + ylab('') +  scale_y_log10() + theme_minimal() + 
                                       theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(),
                                             panel.grid.major = element_blank())
ggExtra::ggMarginal(p, type = 'density', margins='y', groupColour=TRUE, groupFill=TRUE, size=10)

Top genes

top_genes = sd_leave1out_sd_by_gene %>% arrange(desc(sd_l1o)) %>% top_n(n=21, w=sd_l1o) %>% 
            left_join(datGenes, by=c('ID' = 'ensembl_gene_id'))  %>% dplyr::select(ID, hgnc_symbol, sd_l1o)

kable(top_genes, caption='Top genes with the highest sd of leave 1 out sd')
Top genes with the highest sd of leave 1 out sd
ID hgnc_symbol sd_l1o
ENSG00000198804 MT-CO1 4226.2426
ENSG00000118785 SPP1 937.9286
ENSG00000198938 MT-CO3 890.2146
ENSG00000198712 MT-CO2 714.6016
ENSG00000131095 GFAP 705.7251
ENSG00000198886 MT-ND4 600.7525
ENSG00000110436 SLC1A2 486.6570
ENSG00000197971 MBP 425.5593
ENSG00000198786 MT-ND5 363.9253
ENSG00000198727 MT-CYB 362.5641
ENSG00000087086 FTL 283.8756
ENSG00000086570 FAT2 253.8446
ENSG00000198888 MT-ND1 239.5533
ENSG00000198899 MT-ATP6 232.7214
ENSG00000123560 PLP1 229.9541
ENSG00000131018 SYNE1 209.3978
ENSG00000198668 CALM1 191.2898
ENSG00000075624 ACTB 191.0754
ENSG00000198840 MT-ND3 174.4450
ENSG00000163046 ANKRD30BL 154.7978
ENSG00000134982 APC 154.4011
plot_function = function(i){
  i = 3*i-2
  plot_list = list()
  for(j in 1:3){
    plot_data = data.frame('sample' = colnames(datExpr), 'expr' = t(datExpr[top_genes$ID[i+j-1],]), 'Diagnosis' = datMeta$Diagnosis)
    colnames(plot_data)[2] = 'expr'
    plot_list[[j]] = ggplotly(plot_data %>% ggplot(aes(sample, expr, color=Diagnosis)) + 
                              geom_point() + theme_minimal() + 
                              theme(legend.position='none', axis.text.x=element_blank(), axis.ticks.x=element_blank()))
  }
  p = subplot(plot_list, nrows=1) %>% layout(annotations = list(
    list(x = 0.1 , y = 1.05, text =  top_genes$hgnc_symbol[i], showarrow = F, xref='paper', yref='paper'),
    list(x = 0.5 , y = 1.05, text = top_genes$hgnc_symbol[i+1], showarrow = F, xref='paper', yref='paper'),
    list(x = 0.9 , y = 1.05, text = top_genes$hgnc_symbol[i+2], showarrow = F, xref='paper', yref='paper')))
  
  return(p)
}

plot_function(1)
plot_function(2)
plot_function(3)
plot_function(4)
plot_function(5)
plot_function(6)
plot_function(7)


Z-score based analysis

max_z_score = data.frame('ID' = rownames(datExpr), 'z_score' = apply(datExpr,1,function(x) max((x-mean(x))/sd(x))),
                         'outlier_sample' = apply(datExpr, 1, function(x) datMeta$Sample_ID[which.max(abs(x-mean(x))/sd(x))])) %>%
              left_join(DE_info, by='ID')

ENSG00000187951_val = max_z_score[max_z_score$ID == 'ENSG00000187951',2]

cat(paste0('Gene ENSG00000187951 has a value of ', round(ENSG00000187951_val,1)))
## Gene ENSG00000187951 has a value of 8.8
cat(paste0('There are ', sum(max_z_score$z_score>ENSG00000187951_val), ' genes with a value higher than this (~',
           round(100*mean(max_z_score$z_score>ENSG00000187951_val),2),'%)'))
## There are 3 genes with a value higher than this (~0.02%)
summary(max_z_score$z_score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.568   2.676   3.181   3.475   4.003   8.819
p = max_z_score %>% ggplot(aes(ID, z_score, color = DE)) + geom_point(alpha=0.3) + 
                           geom_hline(yintercept = ENSG00000187951_val, color='gray') + 
                           xlab('Genes') + ylab('Max Z-score') + theme_minimal() + 
                           theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(),
                                 legend.position='none', panel.grid.major = element_blank())

ggExtra::ggMarginal(p, type = 'density', margins='y', groupColour=TRUE, groupFill=TRUE, size=10)

Top 20 genes

top_genes = max_z_score %>% dplyr::top_n(n=21, w=z_score) %>% arrange(desc(z_score)) %>% 
            left_join(datGenes, by=c('ID' = 'ensembl_gene_id')) %>%
            dplyr::select(ID, hgnc_symbol, log2FoldChange, padj, DE, z_score)

kable(top_genes, caption='Top 20 genes with the highest max z-score value')
Top 20 genes with the highest max z-score value
ID hgnc_symbol log2FoldChange padj DE z_score
ENSG00000086570 FAT2 1.2019142 0.0020615 TRUE 8.819450
ENSG00000124493 GRM4 0.1435437 0.2984922 FALSE 8.806634
ENSG00000181541 MAB21L2 0.5391928 0.3724211 FALSE 8.803135
ENSG00000187951 ARHGAP11B NA NA NA 8.801469
ENSG00000180347 CCDC129 0.1538918 0.7154404 FALSE 8.787919
ENSG00000164220 F2RL2 0.6499642 0.0886975 FALSE 8.752194
ENSG00000196092 PAX5 0.6662045 0.0779792 FALSE 8.746626
ENSG00000123572 NRK 0.3115349 0.4525888 FALSE 8.729920
ENSG00000165646 SLC18A2 -0.4914993 0.2622890 FALSE 8.691294
ENSG00000074370 ATP2A3 0.4635054 0.0030267 TRUE 8.673706
ENSG00000214694 ARHGEF33 0.1401980 0.1035356 FALSE 8.663063
ENSG00000187017 ESPN 0.2351648 0.6632260 FALSE 8.649915
ENSG00000132130 LHX1 -0.0584408 0.9152006 FALSE 8.646916
ENSG00000187595 ZNF385C 0.4673824 0.0044824 TRUE 8.631137
ENSG00000171219 CDC42BPG 0.5819641 0.1224602 FALSE 8.602846
ENSG00000066230 SLC9A3 -0.1463554 0.7968300 FALSE 8.507227
ENSG00000108001 EBF3 0.5071822 0.2508842 FALSE 8.493193
ENSG00000129277 CCL4 0.0842550 0.8553859 FALSE 8.470016
ENSG00000133063 CHIT1 -0.2575575 0.4639968 FALSE 8.463125
ENSG00000221818 EBF2 0.1693046 0.7626034 FALSE 8.459121
ENSG00000129167 TPH1 0.0902837 0.8168627 FALSE 8.449049
plot_function(1)
plot_function(2)
plot_function(3)
plot_function(4)
plot_function(5)
plot_function(6)
plot_function(7)

Taking all the genes that have at least one sample with a z-score larger than 6

samples_info = table(max_z_score$outlier_sample[max_z_score$z_score>6]) %>% data.frame %>% filter(Freq>0) %>%
               rename(Var1 = 'Sample_ID', Freq = 'Count') %>% left_join(datMeta, by='Sample_ID') %>%
               dplyr::select(Sample_ID, Brain_lobe, Sex, Age, Batch, PMI, Diagnosis, Count) %>% arrange(desc(Count))

cat(paste0(sum(max_z_score$z_score>6), ' genes have a max z-score larger than 6'))
## 591 genes have a max z-score larger than 6
kable(samples_info, caption = 'Samples with the most outliers considering genes with a z-score larger than 6')
Samples with the most outliers considering genes with a z-score larger than 6
Sample_ID Brain_lobe Sex Age Batch PMI Diagnosis Count
AN02987_BA38 Temporal M 15 10.10.2014 30.80 ASD 158
AN02987_BA7 Parietal M 15 10.10.2014 30.80 ASD 101
AN11796_BA38 Temporal M 14 10.10.2014 2.35 ASD 81
AN17515_BA38 Temporal M 54 10.10.2014 28.25 ASD 62
AN00493_BA17 Occipital M 27 10.10.2014 8.30 ASD 40
AN00493_BA4_6 Frontal M 27 6.20.2014 8.30 ASD 35
AN17254_BA38 Temporal M 51 10.10.2014 22.16 ASD 27
AN00764_BA4_6 Frontal M 20 10.10.2014 23.70 ASD 20
AN04166_BA38 Temporal M 24 6.20.2014 18.51 ASD 13
AN01971_BA17 Occipital M 39 10.10.2014 31.50 ASD 8
AN17254_BA7 Parietal M 51 10.10.2014 22.16 ASD 8
AN08792_BA7 Parietal M 30 10.10.2014 20.30 ASD 7
AN12457_BA17 Occipital F 29 6.20.2014 17.83 ASD 7
AN08166_BA4_6 Frontal M 28 10.10.2014 43.25 ASD 4
AN07176_BA4_6 Frontal M 21 10.10.2014 29.91 CTL 3
AN08166_BA7 Parietal M 28 10.10.2014 43.25 ASD 2
AN09730_BA17 Occipital M 22 10.10.2014 25.00 ASD 2
AN13872_BA4_6 Frontal F 5 6.20.2014 33.00 ASD 2
AN00764_BA17 Occipital M 20 10.10.2014 23.70 ASD 1
AN04166_BA7 Parietal M 24 10.10.2014 18.51 ASD 1
AN07444_BA4_6 Frontal M 17 10.10.2014 30.75 CTL 1
AN08043_BA17 Occipital F 52 6.20.2014 39.15 ASD 1
AN08792_BA38 Temporal M 30 6.20.2014 20.30 ASD 1
AN08873_BA17 Occipital M 5 10.10.2014 25.50 ASD 1
AN08873_BA7 Parietal M 5 10.10.2014 25.50 ASD 1
AN10723_BA38 Temporal M 60 10.10.2014 24.23 CTL 1
AN17425_BA17 Occipital M 16 10.10.2014 26.16 CTL 1
AN19511_BA38 Temporal M 8 6.20.2014 22.20 ASD 1
AN19760_BA4_6 Frontal M 28 6.20.2014 23.25 CTL 1

There is not a big difference in the average z-score of the genes in the outlier samples, the most notable difference is in its outliers

plot_data = data.frame('ID' = rownames(datExpr))
outlier_samples = samples_info$Sample_ID[samples_info$Count>5]

for(s in outlier_samples){
  outlier_idx = which(datMeta$Sample_ID==s)
  z_scores = apply(datExpr, 1, function(x) (abs(x[outlier_idx]-mean(x)))/sd(x))
  plot_data = cbind(plot_data, z_scores)
}
colnames(plot_data)[-1] = as.character(outlier_samples)

set.seed(123)
average_sample = sample(datMeta$Sample_ID[! datMeta$Sample_ID %in% outlier_samples],1) %>% as.character
cat(paste0('Using random sample ', average_sample, ' as a reference'))
## Using random sample AN09730_BA4_6 as a reference
random_sample_idx = which(datMeta$Sample_ID==average_sample)

plot_data_melt = plot_data %>% mutate('Random Sample'=apply(datExpr, 1, function(x) abs(x[random_sample_idx]-mean(x))/sd(x))) %>% 
                 melt() %>% left_join(datMeta %>% dplyr::select(Sample_ID, Diagnosis), by = c('variable'='Sample_ID')) %>% 
                 mutate(variable=factor(variable, levels=c(unique(as.character(samples_info$Sample_ID)),'Random Sample'), ordered=T)) %>%
                 mutate(Diagnosis = ifelse(variable=='Random Sample', as.character(datMeta$Diagnosis[datMeta$Sample_ID==average_sample]), as.character(Diagnosis)))

ggplotly(plot_data_melt %>% ggplot(aes(variable, value, fill=Diagnosis)) + geom_boxplot() + xlab('Samples') + 
         ylab('|z-scores|') + theme_minimal() + theme(axis.text.x = element_text(angle = 90, hjust = 1)))
rm(plot_data, plot_data_melt, s, outlier_idx, z_scores, average_sample, random_sample_idx)

Even though most of the outliers come from the same group of samples, they weren’t filtered out when filtering outlier samples

datExpr_filtered = datExpr
datMeta_filtered = datMeta

This was the plot used to filter outlier samples. Some of the outlier samples we have found are close to the threshold of -2, but some aren’t, os they wouldn’t have been filtered out even if the threshold would have been higher

# Load original expression datasets
datExpr = read.csv('./../Data/RNAseq_ASD_datExpr.csv', row.names=1)
datMeta = read.csv('./../Data/RNAseq_ASD_datMeta.csv')

datMeta = datMeta %>% mutate(Brain_Region = as.factor(Region)) %>% 
                      mutate(Brain_lobe = case_when(Brain_Region == 'BA4_6' ~ 'Frontal',
                                                    Brain_Region == 'BA7'   ~ 'Parietal',
                                                    Brain_Region == 'BA38'  ~ 'Temporal',
                                                    TRUE ~ 'Occipital')) %>%
                      mutate(Batch = as.factor(gsub('/', '.', RNAExtractionBatch)), 
                             Diagnosis = factor(Diagnosis_, levels=c('CTL','ASD'))) %>% 
                      dplyr::select(-Diagnosis_)

# Filtering genes: These filters would be the same, so I'll just keep the genes present in the 
# filtered dataset instead of repeating the whole filtering
datExpr = datExpr[rownames(datExpr) %in% rownames(datExpr_filtered),]

# Filtering samples
absadj = datExpr %>% bicor %>% abs
## alpha: 1.000000
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
z.ku = (ku-mean(ku))/sqrt(var(ku))

# Plot results
plot_data = data.frame('sample'=1:length(z.ku), 'distance'=z.ku, 'Sample_ID'=datMeta$Sample_ID, 
                       'Subject_ID'=datMeta$Subject_ID, 'Extraction_Batch'=datMeta$RNAExtractionBatch,
                       'Brain_Lobe'=datMeta$Brain_lobe, 'Sex'=datMeta$Sex, 'Age'=datMeta$Age,
                       'Diagnosis'=datMeta$Diagnosis, 'PMI'=datMeta$PMI) %>% 
            mutate(outlier = Sample_ID %in% samples_info$Sample_ID[samples_info$Count>10]) %>%
            left_join(samples_info %>% dplyr::select(Sample_ID, Count), by='Sample_ID') %>% 
            dplyr::rename('Number of Outliers' = Count) %>% 
            mutate(`Number of Outliers` = ifelse(is.na(`Number of Outliers`), 0, `Number of Outliers`))

ggplotly(plot_data %>% ggplot(aes(sample, distance, color=Diagnosis)) + 
         geom_point(alpha=plot_data$outlier/2+.2, aes(id=Subject_ID)) +
         geom_hline(yintercept = -2, color = 'gray') + theme_minimal() +
         ggtitle('Original position of the outlier genes wrt all the other samples'))
selectable_scatter_plot(plot_data, plot_data[,-c(1,2)])